Here I train the 16x video classifiers with the merged data, i.e. using stock culture data from every time point and light setting recorded.
library(tidyverse)
library(randomForest)
library(readr)
library(viridis)
library(e1071)
library(caret)
load("../../Class_18C_normalLight/1st_class_2020/data/25x/Coleps_irchel/Morph_mvt.RData")
dd25 <- morph_mvt
load("../../Class_18C_normalLight/1st_class_2020/data/25x/Colpidium/Morph_mvt.RData")
morph_mvt$species <- "Colpidium"
dd25 <- rbind(dd25, morph_mvt)
load("../../Class_18C_normalLight/1st_class_2020/data/25x/Euplotes/Morph_mvt.RData")
# morph_mvt <- morph_mvt %>% filter(mean_area>3000)
dd25 <- rbind(dd25, morph_mvt)
load("../../Class_18C_normalLight/1st_class_2020/data/25x/Euplotes_second/Morph_mvt.RData")
# morph_mvt <- morph_mvt %>% filter(mean_area>3000)
dd25 <- rbind(dd25, morph_mvt)
load("../../Class_18C_normalLight/1st_class_2020/data/25x/Paramecium_bursaria/Morph_mvt.RData")
dd25 <- rbind(dd25, morph_mvt)
load("../../Class_18C_normalLight/1st_class_2020/data/25x/Paramecium_caudatum/Morph_mvt.RData")
dd25 <- rbind(dd25, morph_mvt)
load("../../Class_18C_normalLight/1st_class_2020/data/25x/Stylonychia_1/Morph_mvt.RData")
morph_mvt$species <- "Stylonychia1"
dd25 <- rbind(dd25, morph_mvt)
load("../../Class_18C_normalLight/1st_class_2020/data/25x/Stylonychia_1_second/Morph_mvt.RData")
morph_mvt$species <- "Stylonychia1"
dd25 <- rbind(dd25, morph_mvt)
load("../../Class_18C_normalLight/1st_class_2020/data/25x/Stylonychia_2/Morph_mvt.RData")
morph_mvt$species <- "Stylonychia2"
dd25 <- rbind(dd25, morph_mvt)
load("../../Class_18C_normalLight/1st_class_2020/data/25x/Dexiostoma/Morph_mvt.RData")
dd25 <- rbind(dd25, morph_mvt)
load("../../Class_18C_normalLight/1st_class_2020/data/25x/Loxocephallus/Morph_mvt.RData")
dd25 <- rbind(dd25, morph_mvt)
load("../../Class_18C_normalLight/1st_class_2020/data/25x/Tetrahymena/Morph_mvt.RData")
morph_mvt$species <- "Tetrahymena"
dd25 <- rbind(dd25, morph_mvt)
load("../../Class_18C_normalLight/1st_class_2020/data/25x/Cryptomonas/Morph_mvt.RData")
morph_mvt$species <- "Cryptomonas"
dd25 <- rbind(dd25, morph_mvt)
load("../../Class_18C_normalLight/1st_class_2020/data/25x/Debris/Morph_mvt.RData")
morph_mvt$species <- "Debris_and_other"
morph_mvt <- morph_mvt %>% filter(mean_area<700)
dd25 <- full_join(dd25, morph_mvt)
# filtering
dd25 <- dd25 %>%
filter(net_disp>50, net_speed>5, N_frames>10)%>%
select(-edible_algae,-microcosm.nr)
dd25$magnification <- 2.5
# table(dd25$species)
load("../../Class_18C_normalLight/2nd_class_2022Feb/data/25x/Coleps_irchel/Morph_mvt.RData")
dd25_2022feb <- morph_mvt
load("../../Class_18C_normalLight/2nd_class_2022Feb/data/25x/Colpidium/Morph_mvt.RData")
morph_mvt$species <- "Colpidium"
dd25_2022feb <- rbind(dd25_2022feb, morph_mvt)
load("../../Class_18C_normalLight/2nd_class_2022Feb/data/25x/Dexiostoma/Morph_mvt.RData")
dd25_2022feb <- rbind(dd25_2022feb, morph_mvt)
load("../../Class_18C_normalLight/2nd_class_2022Feb/data/25x/Euplotes/Morph_mvt.RData")
dd25_2022feb <- rbind(dd25_2022feb, morph_mvt)
load("../../Class_18C_normalLight/2nd_class_2022Feb/data/25x/Loxochephalus//Morph_mvt.RData")
dd25_2022feb <- rbind(dd25_2022feb, morph_mvt)
load("../../Class_18C_normalLight/2nd_class_2022Feb/data/25x/Paramecium_bursaria/Morph_mvt.RData")
# morph_mvt %>%
# ggplot(aes(mean_area))+
# geom_histogram()+
# geom_vline(xintercept = 3000)
morph_mvt <- morph_mvt %>% dplyr::filter(mean_area>3000)
dd25_2022feb <- rbind(dd25_2022feb, morph_mvt)
load("../../Class_18C_normalLight/2nd_class_2022Feb/data/25x/Paramecium_caudatum/Morph_mvt.RData")
dd25_2022feb <- rbind(dd25_2022feb, morph_mvt)
load("../../Class_18C_normalLight/2nd_class_2022Feb/data/25x/Stylonychia2_batch1/Morph_mvt.RData")
morph_mvt$species <- "Stylonychia2_batch1"
dd25_2022feb <- rbind(dd25_2022feb, morph_mvt)
load("../../Class_18C_normalLight/2nd_class_2022Feb/data/25x/Stylonychia2_batch2/Morph_mvt.RData")
morph_mvt$species <- "Stylonychia2_batch2"
dd25_2022feb <- rbind(dd25_2022feb, morph_mvt)
load("../../Class_18C_normalLight/2nd_class_2022Feb/data/25x/Tetrahymena/Morph_mvt.RData")
dd25_2022feb <- rbind(dd25_2022feb, morph_mvt)
# filtering
dd25_2022feb <- dd25_2022feb %>%
filter(net_disp>50, net_speed>5, N_frames>10)
dd25_2022feb$magnification <- 2.5
# table(dd25_2022feb$species)
load("../../Class_18C_normalLight/3rd_data_20220301/data/25x/large_ciliates/Morph_mvt.RData")
dd25_20220301 <- morph_mvt
load("../../Class_18C_normalLight/3rd_data_20220301/data/25x/small_ciliates/Morph_mvt.RData")
dd25_20220301 <- rbind(dd25_20220301, morph_mvt)
# filtering
dd25_20220301 <- dd25_20220301 %>%
filter(net_disp>50, net_speed>5, N_frames>10)
dd25_20220301$magnification <- 2.5
# dd25_20220301 %>%
# ggplot(aes(mean_area))+
# geom_histogram()+
# facet_wrap(~species, scales = "free")
dd25_20220301 <- dd25_20220301 %>%
dplyr::filter(!(species %in% c("Paramecium_caudatum","Paramecium_bursaria","Euplotes") & mean_area<2000))
# table(dd25_20220301$species)
Paramecium_caudatum maybe needs more cleaning
load("../../Class_18C_normalLight/4th_data_20220316/data/25x/large_ciliates/Morph_mvt.RData")
dd25_20220316 <- morph_mvt
load("../../Class_18C_normalLight/4th_data_20220316/data/25x/small_ciliates/Morph_mvt.RData")
dd25_20220316 <- rbind(dd25_20220316, morph_mvt)
# filtering
dd25_20220316 <- dd25_20220316 %>%
filter(net_disp>50, net_speed>5, N_frames>10)
dd25_20220316$magnification <- 2.5
# dd25_20220316 %>%
# ggplot(aes(mean_area))+
# geom_histogram()+
# facet_wrap(~species, scales = "free")
dd25_20220316 <- dd25_20220316 %>%
dplyr::filter(!(species %in% c("Paramecium_caudatum","Paramecium_bursaria","Euplotes","Stylonychia_2") & mean_area<2000))
# table(dd25_20220316$species)
load("../../Class_18C_normalLight/5th_data_20220406/data/25x/large_ciliates/Morph_mvt.RData")
dd25_20220406 <- morph_mvt
load("../../Class_18C_normalLight/5th_data_20220406/data/25x/small_ciliates/Morph_mvt.RData")
dd25_20220406 <- rbind(dd25_20220406, morph_mvt)
# filtering
dd25_20220406 <- dd25_20220406 %>%
filter(net_disp>50, net_speed>5, N_frames>10)
dd25_20220406$magnification <- 2.5
# dd25_20220406 %>%
# ggplot(aes(mean_area))+
# geom_histogram()+
# facet_wrap(~species, scales = "free")
dd25_20220406 <- dd25_20220406 %>%
dplyr::filter(!(species %in% c("Paramecium_caudatum","Paramecium_bursaria","Euplotes","Stylonychia_2") & mean_area<2000))
# table(dd25_20220406$species)
load("../../Class_18C_decreasingLight/Light_18perc/data/25x/Euplotes/Morph_mvt.RData")
dd25_2022feb_decrease <- morph_mvt
load("../../Class_18C_decreasingLight/Light_18perc/data/25x/Paramecium_bursaria/Morph_mvt.RData")
a <- morph_mvt %>%
ggplot(aes(mean_area))+
geom_histogram()
Pburs_dark_25x <- morph_mvt %>%
dplyr::select(mean_area,sd_area,mean_perimeter,sd_perimeter,mean_major,
sd_major,mean_ar,sd_ar,
mean_turning,sd_turning,gross_disp,max_net,net_disp,net_speed,max_step,
min_step,sd_step,sd_gross_speed,max_gross_speed,min_gross_speed)
set.seed(23)
Pburs_dark_25x_clust <- kmeans(Pburs_dark_25x, centers = 2, nstart = 25, iter.max = 1000)
Pburs_dark_25x$cluster <- as.factor(Pburs_dark_25x_clust$cluster)
b <- Pburs_dark_25x %>%
ggplot(aes(mean_area, col=cluster, fill=cluster))+
geom_histogram(position = "identity", alpha=0.3)
Pburs_dark_25x_clust <- kmeans(Pburs_dark_25x, centers = 3, nstart = 25, iter.max = 1000)
Pburs_dark_25x$cluster <- as.factor(Pburs_dark_25x_clust$cluster)
c <- Pburs_dark_25x %>%
ggplot(aes(mean_area, col=cluster, fill=cluster))+
geom_histogram(position = "identity", alpha=0.3)
Pburs_dark_25x_clust <- kmeans(Pburs_dark_25x, centers = 4, nstart = 25, iter.max = 1000)
Pburs_dark_25x$cluster <- as.factor(Pburs_dark_25x_clust$cluster)
d <- Pburs_dark_25x %>%
ggplot(aes(mean_area, col=cluster, fill=cluster))+
geom_histogram(position = "identity", alpha=0.3)
morph_mvt <- morph_mvt %>%
dplyr::mutate(cluster=as.factor(Pburs_dark_25x_clust$cluster)) %>%
dplyr::filter(cluster !="4") %>%
dplyr::select(-cluster)
e <- morph_mvt %>%
ggplot(aes(mean_area))+
geom_histogram()
# a + b + c + d + e + plot_annotation(title = "Cleaning of P.burs dark 25x")
dd25_2022feb_decrease <- rbind(dd25_2022feb_decrease, morph_mvt)
# filtering
dd25_2022feb_decrease <- dd25_2022feb_decrease %>%
filter(net_disp>50, net_speed>5, N_frames>10)
dd25_2022feb_decrease$magnification <- 2.5
# table(dd25_2022feb_decrease$species)
load("../../Class_18C_decreasingLight/Light_10perc/data/25x/large_ciliates/Morph_mvt.RData")
dd25_20220301_decrease <- morph_mvt
load("../../Class_18C_decreasingLight/Light_10perc/data/25x/small_ciliates/Morph_mvt.RData")
dd25_20220301_decrease <- rbind(dd25_20220301_decrease, morph_mvt)
# filtering
dd25_20220301_decrease <- dd25_20220301_decrease %>%
filter(net_disp>50, net_speed>5, N_frames>10)
dd25_20220301_decrease$magnification <- 2.5
# dd25_20220301_decrease %>%
# ggplot(aes(mean_area))+
# geom_histogram()+
# facet_wrap(~species, scales = "free")
dd25_20220301_decrease <- dd25_20220301_decrease %>%
dplyr::filter(!(species %in% c("Paramecium_bursaria","Euplotes","Paramecium_caudatum") & mean_area<2500),
!(species %in% c("Coleps_irchel") & mean_area<1000))
# table(dd25_20220301_decrease$species)
load("../../Class_18C_decreasingLight/Light_6perc/data/25x/large_ciliates/Morph_mvt.RData")
dd25_20220316_decrease <- morph_mvt
load("../../Class_18C_decreasingLight/Light_6perc/data/25x/small_ciliates/Morph_mvt.RData")
dd25_20220316_decrease <- rbind(dd25_20220316_decrease, morph_mvt)
# filtering
dd25_20220316_decrease <- dd25_20220316_decrease %>%
filter(net_disp>50, net_speed>5, N_frames>10)
dd25_20220316_decrease$magnification <- 2.5
# dd25_20220316_decrease %>%
# ggplot(aes(mean_area))+
# geom_histogram()+
# facet_wrap(~species, scales = "free")
dd25_20220316_decrease <- dd25_20220316_decrease %>%
dplyr::filter(!(species %in% c("Paramecium_bursaria","Euplotes","Paramecium_caudatum") & mean_area<2000))
# table(dd25_20220316_decrease$species)
load("../../Class_18C_decreasingLight/Light_1perc/data/25x/large_ciliates/Morph_mvt.RData")
dd25_20220406_decrease <- morph_mvt
load("../../Class_18C_decreasingLight/Light_1perc/data/25x/small_ciliates/Morph_mvt.RData")
dd25_20220406_decrease <- rbind(dd25_20220406_decrease, morph_mvt)
# filtering
dd25_20220406_decrease <- dd25_20220406_decrease %>%
filter(net_disp>50, net_speed>5, N_frames>10)
dd25_20220406_decrease$magnification <- 2.5
# dd25_20220406_decrease %>%
# ggplot(aes(mean_area))+
# geom_histogram()+
# facet_wrap(~species, scales = "free")
dd25_20220406_decrease <- dd25_20220406_decrease %>%
dplyr::filter(!(species %in% c("Paramecium_bursaria","Euplotes","Paramecium_caudatum") & mean_area<2000))
# table(dd25_20220406_decrease$species)
dd25$data <- "Late 2020"
dd25_2022feb$data <- "20220201"
dd25_2022feb_decrease$data <- "20220201"
dd25_20220301$data <- "20220301"
dd25_20220301_decrease$data <- "20220301"
dd25_20220316$data <- "20220316"
dd25_20220316_decrease$data <- "20220316"
dd25_20220406$data <- "20220406"
dd25_20220406_decrease$data <- "20220406"
dd_30perc <- rbind(dd25,
dd25_2022feb,dd25_20220301 %>% dplyr::select(-Light_dark),
dd25_20220316 %>% dplyr::select(-Light_dark),
dd25_20220406 %>% dplyr::select(-Light_dark))
dd_18perc <- dd25_2022feb_decrease
dd_10perc <- dd25_20220301_decrease %>%
dplyr::select(-Light_dark)
dd_6perc <- dd25_20220316_decrease %>%
dplyr::select(-Light_dark)
dd_1perc <- dd25_20220406_decrease %>%
dplyr::select(-Light_dark)
dd_30perc$light <- "30%"
dd_18perc$light <- "18%"
dd_10perc$light <- "10%"
dd_6perc$light <- "6%"
dd_1perc$light <- "1%"
dd <- rbind(dd_30perc,dd_18perc,dd_10perc,dd_6perc,dd_1perc)
dd$species <- ifelse(dd$species == "Stylonychia_2", "Stylonychia2", dd$species)
dd$species <- ifelse(dd$species %in% c("Stylonychia2_batch1","Stylonychia2_batch2"), "Stylonychia2", dd$species)
# table(dd$species, dd$data, dd$magnification, dd$light)
If an individual is an outlier in more than 3 traits it gets excluded (about 7% gets excluded). Outlier based on boxplot definition.
dd$id <- 1:nrow(dd)
dd <- dd %>% na.omit()
dd_long <- dd %>%
dplyr::select(species, mean_area,sd_area,mean_perimeter,sd_perimeter,mean_major,
sd_major,mean_ar,sd_ar,
mean_turning,sd_turning,gross_disp,max_net,net_disp,net_speed,max_step,
min_step,sd_step,sd_gross_speed,max_gross_speed,min_gross_speed, id,
data, light, magnification) %>%
pivot_longer(cols=-c(id,species,data,light,magnification), names_to="variable") %>%
dplyr::group_by(variable,species,data,light,magnification) %>%
mutate(iqr = IQR(value),
first_quartile = quantile(value, probs = 0.25),
third_quartile = quantile(value, probs = 0.75),
outlier = ifelse(value<first_quartile-1.5*iqr | value>third_quartile+1.5*iqr,T,F))
outliers <- dd_long %>%
dplyr::filter(outlier==T) %>%
dplyr::group_by(species, id, data, light, magnification) %>%
dplyr::summarise(n = n()) %>%
dplyr::filter(n>3)
# table(outliers$species)
# nrow(outliers)/nrow(dd)
dd_filtered <- dd %>%
dplyr::filter(!is.element(id,outliers$id))
dd$removed_outliers <- F
dd_filtered$removed_outliers <- T
dd_comparison <- rbind(dd,dd_filtered)
dd <- dd_filtered
# dd_comparison %>%
# ggplot(aes((mean_area), col=removed_outliers))+
# geom_histogram(aes(fill=removed_outliers, y=..density..), position = "identity", alpha=0.3) +
# # geom_boxplot(outlier.alpha = 0.3) +
# theme_bw() +
# facet_wrap(species~interaction(light,data), scales = "free")
#
# dd_filtered %>%
# ggplot(aes(log10(mean_area)))+
# geom_density(aes(col=species))
# table(dd$species, dd$data, dd$magnification, dd$light)
We have data from several time points and light settings. The goals is that in the final training data these are somewhat equally represented, regardless that the number of individuals tracked per species and time point can vary a lot (in other words: “even it out” across species, time point and light setting). This is important, as otherwise it can be that groups that are more abundant are weighted more.
dd <- dd %>%
mutate(data.light = interaction(data,light, drop = T),
data.light.species=interaction(data,light,species, drop = T))
print("number of individuals per species per date and light setting")
## [1] "number of individuals per species per date and light setting"
tab <- table(dd$data.light, dd$species)
tab
##
## Coleps_irchel Colpidium Cryptomonas Debris_and_other Dexiostoma
## 20220406.1% 15 154 0 0 15
## 20220301.10% 614 184 0 0 342
## 20220201.18% 0 0 0 0 0
## 20220201.30% 904 330 0 0 259
## 20220301.30% 587 193 0 0 317
## 20220316.30% 91 234 0 0 199
## 20220406.30% 73 93 0 0 100
## Late 2020.30% 531 155 3475 623 649
## 20220316.6% 147 212 0 0 120
##
## Euplotes Loxocephallus Paramecium_bursaria Paramecium_caudatum
## 20220406.1% 35 362 118 137
## 20220301.10% 33 1041 87 370
## 20220201.18% 708 0 1230 0
## 20220201.30% 228 967 488 445
## 20220301.30% 55 834 122 564
## 20220316.30% 136 687 860 786
## 20220406.30% 148 403 587 329
## Late 2020.30% 836 1090 1342 1017
## 20220316.6% 354 695 751 780
##
## Stylonychia1 Stylonychia2 Tetrahymena
## 20220406.1% 0 148 304
## 20220301.10% 0 448 1027
## 20220201.18% 0 0 0
## 20220201.30% 0 538 1035
## 20220301.30% 0 942 974
## 20220316.30% 0 642 1086
## 20220406.30% 0 625 554
## Late 2020.30% 345 146 1176
## 20220316.6% 0 469 676
print("Sum of individuals per species")
## [1] "Sum of individuals per species"
colsums <- colSums(tab)
colsums
## Coleps_irchel Colpidium Cryptomonas Debris_and_other
## 2962 1555 3475 623
## Dexiostoma Euplotes Loxocephallus Paramecium_bursaria
## 2001 2533 6079 5585
## Paramecium_caudatum Stylonychia1 Stylonychia2 Tetrahymena
## 4428 345 3958 6832
Of course, Stylo1 is the one we the least individuals, because it went extinct and we do not have it anymore. As it went extinct, it shouldn’t be the limiting factor, I think. For now at least I’m taking 3 * Stylo1_sum as the number of individuals per species to be included (if a species/class has less than that all individuals are included).
Within a class I use that number so that roughly equally many individuals across time steps and light settings are included. As a last thing I divide the data into a training dataset and in a test dataset.
n <- min(colsums)*3
print(paste0("Max number of individuals used per species (if there are fewer for a species, then all are used): n=",n))
## [1] "Max number of individuals used per species (if there are fewer for a species, then all are used): n=1035"
num_ind_per_class <- dd %>% group_by(species) %>%
summarize(num_class = length(unique(data.light.species)),
num_ind_per_class = floor(n/num_class)) %>%
select(species,num_ind_per_class)
num_ind_per_class_vec <- c(num_ind_per_class$num_ind_per_class)
names(num_ind_per_class_vec) <- num_ind_per_class$species
set.seed(53)
split_up <- split(dd, f = dd$data.light.species)
split_up <- lapply(split_up, function(x) {
species <- unique(x$species)
x %>% sample_n(ifelse(nrow(x) < num_ind_per_class_vec[species], nrow(x), num_ind_per_class_vec[species]))})
trainingdata <- do.call("rbind", split_up)
print("Subsampled: number of individuals per species per date and light setting")
## [1] "Subsampled: number of individuals per species per date and light setting"
tab2 <- table(trainingdata$data.light, trainingdata$species)
tab2
##
## Coleps_irchel Colpidium Cryptomonas Debris_and_other Dexiostoma
## 20220406.1% 15 129 0 0 15
## 20220301.10% 129 129 0 0 129
## 20220201.18% 0 0 0 0 0
## 20220201.30% 129 129 0 0 129
## 20220301.30% 129 129 0 0 129
## 20220316.30% 91 129 0 0 129
## 20220406.30% 73 93 0 0 100
## Late 2020.30% 129 129 1035 623 129
## 20220316.6% 129 129 0 0 120
##
## Euplotes Loxocephallus Paramecium_bursaria Paramecium_caudatum
## 20220406.1% 35 129 115 129
## 20220301.10% 33 129 87 129
## 20220201.18% 115 0 115 0
## 20220201.30% 115 129 115 129
## 20220301.30% 55 129 115 129
## 20220316.30% 115 129 115 129
## 20220406.30% 115 129 115 129
## Late 2020.30% 115 129 115 129
## 20220316.6% 115 129 115 129
##
## Stylonychia1 Stylonychia2 Tetrahymena
## 20220406.1% 0 129 129
## 20220301.10% 0 129 129
## 20220201.18% 0 0 0
## 20220201.30% 0 129 129
## 20220301.30% 0 129 129
## 20220316.30% 0 129 129
## 20220406.30% 0 129 129
## Late 2020.30% 345 129 129
## 20220316.6% 0 129 129
print("Subsampled: Sum of individuals per species")
## [1] "Subsampled: Sum of individuals per species"
colsums2 <- colSums(tab2)
colsums2
## Coleps_irchel Colpidium Cryptomonas Debris_and_other
## 824 996 1035 623
## Dexiostoma Euplotes Loxocephallus Paramecium_bursaria
## 880 813 1032 1007
## Paramecium_caudatum Stylonychia1 Stylonychia2 Tetrahymena
## 1032 345 1032 1032
trainingdata <- trainingdata[complete.cases(trainingdata),]
# A stratified random split of the data
idx_train <- createDataPartition(trainingdata$species,
p = 0.75, # percentage of data as training
list = FALSE)
testdata <- trainingdata[-idx_train,]
trainingdata <- trainingdata[idx_train,]
species <- unique(dd$species)
species <- species[!is.element(species,c("Debris_and_other","Cryptomonas"))]
compositions <- read_csv("../../compositions.csv")
## Rows: 15 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): composition
## dbl (19): richness, Chlamydomonas, Cryptomonas, Monoraphidium, Cosmarium, St...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
comp_name <- compositions$composition
compositions <- compositions[,species]
compositions_list <- apply(compositions, 1, function(x){
idx <- which(x==1)
names(idx)
})
names(compositions_list) <- comp_name
## filterting out Stylo1
compositions_list <- lapply(compositions_list, function(x){
x[x!="Stylonychia1"]
})
classifier_plot_svm <- function(table, combination.nr){
# cm <- classifier$confusion
cm <- table
ncol <- ncol(cm)
cm <- apply(cm, 1, function(x) x/sum(x))
cm_long <- data.frame(Predicted = factor(rep(rownames(cm),ncol), levels = rownames(cm)),
Truth = factor(rep(colnames(cm), each=ncol), levels = colnames(cm)),
Classification = as.vector(cm))
plot <- cm_long %>%
ggplot(aes(Predicted,Truth,fill=Classification))+
geom_tile() +
geom_text(aes(label = round(Classification, 3)), col="white") +
scale_x_discrete(position = "top") +
scale_y_discrete(limits = rev(levels(cm_long$Truth))) +
scale_fill_viridis(option = "D", end=.8, discrete=FALSE)+
theme(axis.text.x = element_text(angle = 90, hjust = 0))+
theme(legend.text = element_text(size=14),
axis.title = element_text(size=14),
title = element_text(size=16),
axis.text = element_text(size=13))+
labs(title=paste("PPV of",combination.nr),fill="PPV")
return(plot)
}
classifier_assessment_plot <- function(cf, combination.nr){
cf.df <- cf$byClass[,1:4] %>%
as.data.frame() %>%
mutate(Species = gsub("Class: ", "", rownames(cf$byClass[,1:5]))) %>%
rename("NPV" = "Neg Pred Value", "PPV" = "Pos Pred Value",
"Sens" = "Sensitivity", "Spec" = "Specificity") %>%
pivot_longer(cols = 1:4) %>%
mutate(col = ifelse(value>=0.9,"1",
ifelse(value<0.9 & value>=0.8,"2",
ifelse(value<0.8 & value>=0.7,"3","4"))))
plot <- cf.df %>%
ggplot(aes(name,Species,fill=col))+
geom_tile() +
geom_text(aes(label = round(value, 3)), col="white") +
scale_x_discrete(position = "top") +
scale_y_discrete(limits = rev(levels(as.factor(cf.df$Species)))) +
scale_fill_manual(values = c("#006837","#66bd63","#f46d43","#a50026"), breaks = c("1","2","3","4")) +
theme(legend.text = element_text(size=14),
title = element_text(size = 16),
axis.title = element_blank(),
axis.text = element_text(size=13),
legend.position = "none")+
labs(title=combination.nr, fill="")
return(plot)
}
formula <- factor(species) ~ mean_area + sd_area + mean_perimeter + sd_perimeter + mean_major +
sd_major + mean_ar + sd_ar +
mean_turning + sd_turning + gross_disp + max_net + net_disp + net_speed + max_step +
min_step + sd_step + sd_gross_speed + max_gross_speed + min_gross_speed
set.seed(474)
classifiers_18c_25x <- list()
classifiers_18c_25x_plots <- list()
classifiers_18c_25x_assess_plots <- list()
for(i in 1:length(compositions_list)){
sub_trainingdata <- trainingdata %>%
filter(species %in% c(compositions_list[[i]],"Cryptomonas","Debris_and_other"))
sub_testdata <- testdata %>%
filter(species %in% c(compositions_list[[i]],"Cryptomonas","Debris_and_other"))
sub_trainingdata$species <- factor(sub_trainingdata$species)
sub_testdata$species <- factor(sub_testdata$species)
# split_up <- split(sub_trainingdata, f = sub_trainingdata$species)
# subsamples <- lapply(split_up, function(x) {
# x %>% sample_n(ifelse(nrow(x) < 500, nrow(x), 500))})
# sub_trainingdata <- do.call("rbind", subsamples)
# # A stratified random split of the data
# idx_train <- createDataPartition(sub_trainingdata$species,
# p = 0.7, # percentage of data as training
# list = FALSE)
#
# sub_testdata <- sub_trainingdata[-idx_train,]
# sub_trainingdata <- sub_trainingdata[idx_train,]
obj <- tune(svm, formula, data = sub_trainingdata, kernel="radial",
ranges = list(gamma = 2^(-8:2), cost = 2^(1:10)), probability = TRUE,
tunecontrol = tune.control(sampling = "fix", best.model = T))
classifiers_18c_25x[[i]] <- obj$best.model
cf <- caret::confusionMatrix(predict(obj$best.model, newdata=sub_testdata %>% select(-species)),
sub_testdata$species)
classifiers_18c_25x_plots[[i]] <- classifier_plot_svm(table = cf$table,
combination.nr = names(compositions_list)[[i]])
classifiers_18c_25x_assess_plots[[i]] <- classifier_assessment_plot(cf, names(compositions_list)[[i]])
}
names(classifiers_18c_25x_plots) <- names(classifiers_18c_25x) <-
names(classifiers_18c_25x_assess_plots) <- comp_name
There are mainly 4 measures:
Sensitivity: the probability that an individuals is classified as species X given that the it is of species X: P(Classified as species X | is of species X)
Specificity: the probability that an individuals is not classified as species X given that it is not of species X: P(Not classified as species X | is not of species X)
Positive Predictive Value PPV: the probability that an individual is of species X given that it has been classified as species X: P(is of species X | is classified as species X)
Negative Predictive Value NPV: the probability that an individuals is not of species X given that it has not been classified as species X: P(is not of species X | has not been classified as species X)
PPV is the most important for us!
library(patchwork)
for(i in 1:length(classifiers_18c_25x_plots)){
print(classifiers_18c_25x_plots[[i]] + classifiers_18c_25x_assess_plots[[i]] +
plot_layout(widths = c(4,2)))
}
saveRDS(classifiers_18c_25x, "svm_video_classifiers_18c_25x_20220316_MergedData.rds")
# cl <- readRDS("classifiers_18c_25x.rds")